#load my standard toolbox
suppressWarnings(suppressMessages(library(tidyverse))) #required
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(lubridate)))
suppressWarnings(suppressMessages(library(forcats)))
suppressWarnings(suppressMessages(library(ggthemes)))
suppressWarnings(suppressMessages(library(reshape2)))
suppressPackageStartupMessages(library(dataMaid))
suppressPackageStartupMessages(library(ggplot2))

#extra tooling that is useful
suppressWarnings(suppressMessages(library(Hmisc))) #required
suppressWarnings(suppressMessages(library(pastecs))) #required
suppressWarnings(suppressMessages(library(stats))) #required

#some extras for mapping
suppressWarnings(suppressMessages(library(usmap))) #required
suppressWarnings(suppressMessages(library(maps)))
suppressWarnings(suppressMessages(library(mapdata)))
suppressWarnings(suppressMessages(library(ggmap)))
suppressWarnings(suppressMessages(library(ggrepel)))
suppressWarnings(suppressMessages(library(devtools)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(tcltk)))
suppressWarnings(suppressMessages(library(aplpack)))
suppressWarnings(suppressMessages(library(scales)))

#set print options
options(max.print=1000)
options(tibble.print_max = 1000, tibble.print_min = 1000)

#Session Info
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.3
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] tcltk     stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] scales_1.0.0    aplpack_1.3.2   knitr_1.21      usethis_1.4.0  
##  [5] devtools_2.0.1  ggrepel_0.8.0   ggmap_3.0.0     mapdata_2.3.0  
##  [9] maps_3.3.0      usmap_0.4.0     pastecs_1.3.21  Hmisc_4.2-0    
## [13] Formula_1.2-3   survival_2.42-3 lattice_0.20-35 dataMaid_1.2.0 
## [17] reshape2_1.4.3  ggthemes_4.0.1  lubridate_1.7.4 forcats_0.4.0  
## [21] stringr_1.4.0   dplyr_0.8.0.1   purrr_0.3.0     readr_1.3.1    
## [25] tidyr_0.8.2     tibble_2.0.1    ggplot2_3.1.0   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137        fs_1.2.6            bitops_1.0-6       
##  [4] RColorBrewer_1.1-2  httr_1.4.0          rprojroot_1.3-2    
##  [7] tools_3.5.1         backports_1.1.3     R6_2.4.0           
## [10] rpart_4.1-13        lazyeval_0.2.1      colorspace_1.4-0   
## [13] nnet_7.3-12         withr_2.1.2         tidyselect_0.2.5   
## [16] gridExtra_2.3       prettyunits_1.0.2   processx_3.2.1     
## [19] compiler_3.5.1      cli_1.0.1           rvest_0.3.2        
## [22] htmlTable_1.13.1    xml2_1.2.0          desc_1.2.0         
## [25] checkmate_1.9.1     DEoptimR_1.0-8      robustbase_0.93-3  
## [28] callr_3.1.1         digest_0.6.18       foreign_0.8-70     
## [31] rmarkdown_1.11      base64enc_0.1-3     jpeg_0.1-8         
## [34] pkgconfig_2.0.2     htmltools_0.3.6     sessioninfo_1.1.1  
## [37] htmlwidgets_1.3     rlang_0.3.1         readxl_1.3.0       
## [40] rstudioapi_0.9.0    generics_0.0.2      jsonlite_1.6       
## [43] acepack_1.4.1       magrittr_1.5        Matrix_1.2-14      
## [46] Rcpp_1.0.0          munsell_0.5.0       stringi_1.3.1      
## [49] yaml_2.2.0          pkgbuild_1.0.2      plyr_1.8.4         
## [52] grid_3.5.1          crayon_1.3.4        haven_2.0.0        
## [55] splines_3.5.1       pander_0.6.3        hms_0.4.2          
## [58] ps_1.3.0            pillar_1.3.1        boot_1.3-20        
## [61] rjson_0.2.20        pkgload_1.0.2       glue_1.3.0         
## [64] evaluate_0.13       latticeExtra_0.6-28 remotes_2.0.2      
## [67] data.table_1.12.0   modelr_0.1.4        png_0.1-7          
## [70] RgoogleMaps_1.4.3   cellranger_1.1.0    gtable_0.2.0       
## [73] assertthat_0.2.0    xfun_0.4            broom_0.5.1        
## [76] memoise_1.1.0       cluster_2.0.7-1
#set your working directory for this code to work properly
root_dir<-"/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1"
setwd(root_dir)
#setwd("/Users/Dhyan/Desktop/DoingDataScience/r_mustangs/Case1")

#echo `pwd`/`ls beers.csv`
#/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1/Data/beers.csv

#echo `pwd`/`ls breweries.csv`
#/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1/Data/Breweries.csv

####################################
## IMPORT RAW DATASETS (being explicit with col_types after inspecting the data)
####################################

beers <- read_csv(file = "Data/Beers.csv",
                  col_types = cols(
                    Name = col_character(),
                    Beer_ID = col_integer(),
                    ABV = col_double(),
                    IBU = col_integer(),
                    Brewery_id = col_integer(),
                    Style = col_character(),
                    Ounces = col_double()
                  )
)

breweries <- read_csv(file = "Data/Breweries.csv",
                      col_types = cols(
                        Brew_ID = col_integer(),
                        Name = col_character(),
                        City = col_character(),
                        State = col_character()
                      )
)

####################################
## RUN DATAMAID ANALYSIS REPORTS ON RAW DATA
####################################
setwd("Analysis")
  
  makeDataReport(beers, replace=TRUE)
  makeDataReport(breweries, replace=TRUE)

setwd(root_dir)





#check for problems
problems(beers)
## [1] row      col      expected actual  
## <0 rows> (or 0-length row.names)
problems(breweries)
## [1] row      col      expected actual  
## <0 rows> (or 0-length row.names)
#summary of tables
kable(summary(beers))
Name Beer_ID ABV IBU Brewery_id Style Ounces
Length:2410 Min. : 1.0 Min. :0.00100 Min. : 4.00 Min. : 1.0 Length:2410 Min. : 8.40
Class :character 1st Qu.: 808.2 1st Qu.:0.05000 1st Qu.: 21.00 1st Qu.: 94.0 Class :character 1st Qu.:12.00
Mode :character Median :1453.5 Median :0.05600 Median : 35.00 Median :206.0 Mode :character Median :12.00
NA Mean :1431.1 Mean :0.05977 Mean : 42.71 Mean :232.7 NA Mean :13.59
NA 3rd Qu.:2075.8 3rd Qu.:0.06700 3rd Qu.: 64.00 3rd Qu.:367.0 NA 3rd Qu.:16.00
NA Max. :2692.0 Max. :0.12800 Max. :138.00 Max. :558.0 NA Max. :32.00
NA NA NA’s :62 NA’s :1005 NA NA NA
kable(summary(breweries))
Brew_ID Name City State
Min. : 1.0 Length:558 Length:558 Length:558
1st Qu.:140.2 Class :character Class :character Class :character
Median :279.5 Mode :character Mode :character Mode :character
Mean :279.5 NA NA NA
3rd Qu.:418.8 NA NA NA
Max. :558.0 NA NA NA
#we have some NAs to worry about in ABV and IBU. brew_id max equals breweryid max which is good


#Question 1 - First we will find out how many breweries are in each state.
#For safety lets trim out any bad characters and whitespace
breweries$State <- str_trim(breweries$State)

#Find the count of Breweries by State
(breweries_count <- breweries %>%
    filter(breweries$State %in% state.abb) %>%
    count(State))
## # A tibble: 50 x 2
##    State     n
##    <chr> <int>
##  1 AK        7
##  2 AL        3
##  3 AR        2
##  4 AZ       11
##  5 CA       39
##  6 CO       47
##  7 CT        8
##  8 DE        2
##  9 FL       15
## 10 GA        7
## 11 HI        4
## 12 IA        5
## 13 ID        5
## 14 IL       18
## 15 IN       22
## 16 KS        3
## 17 KY        4
## 18 LA        5
## 19 MA       23
## 20 MD        7
## 21 ME        9
## 22 MI       32
## 23 MN       12
## 24 MO        9
## 25 MS        2
## 26 MT        9
## 27 NC       19
## 28 ND        1
## 29 NE        5
## 30 NH        3
## 31 NJ        3
## 32 NM        4
## 33 NV        2
## 34 NY       16
## 35 OH       15
## 36 OK        6
## 37 OR       29
## 38 PA       25
## 39 RI        5
## 40 SC        4
## 41 SD        1
## 42 TN        3
## 43 TX       28
## 44 UT        4
## 45 VA       16
## 46 VT       10
## 47 WA       23
## 48 WI       20
## 49 WV        1
## 50 WY        4
#Plot the number of Breweries for each State
#Show ggplot. Center title, with axis titles

ggplot(data = breweries_count, aes(x=State, y=n, fill=n)) +
  geom_bar(stat = "identity", colour="black") + 
  geom_text(aes(label=n, size=5, vjust = -0.5, hjust= 0.5), show.legend = FALSE) +
  ggtitle("Breweries By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="State", y="Breweries") +
  guides(fill = guide_legend(title = "Breweries")) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  theme(axis.title=element_text(face="bold"))



Hopsy Question 1

How many breweries are present in each state?

As show in the above graph, the State population of Craft breweries varies widely depending on locality. For example, you have disparities even from “the State next door” such as 39 Breweries in California, and only 2 in Nevada. It is anticipated that these differences are largely due to State and Local laws and restrictions, population density, as well as material supply-chain and local preference factors for Craft Beer Brewing.



#Question 2 - We will now merge the beer and breweries datasets and verify the merge.
#rename column in breweries for simplicity on IDs
breweries <- rename(breweries, Brewery_id=Brew_ID)

#Merge both tables
merged_beers <- merge(beers, breweries, by='Brewery_id')

#Fix redundant names
merged_beers <- rename(merged_beers, BeerName=Name.x)
merged_beers <- rename(merged_beers, BrewerName=Name.y)

#View the structure of the new table
str(merged_beers)
## 'data.frame':    2410 obs. of  10 variables:
##  $ Brewery_id: int  1 1 1 1 1 1 2 2 2 2 ...
##  $ BeerName  : chr  "Get Together" "Maggie's Leap" "Wall's End" "Pumpion" ...
##  $ Beer_ID   : int  2692 2691 2690 2689 2688 2687 2686 2685 2684 2683 ...
##  $ ABV       : num  0.045 0.049 0.048 0.06 0.06 0.056 0.08 0.125 0.077 0.042 ...
##  $ IBU       : int  50 26 19 38 25 47 68 80 25 42 ...
##  $ Style     : chr  "American IPA" "Milk / Sweet Stout" "English Brown Ale" "Pumpkin Ale" ...
##  $ Ounces    : num  16 16 16 16 16 16 16 16 16 16 ...
##  $ BrewerName: chr  "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" ...
##  $ City      : chr  "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
##  $ State     : chr  "MN" "MN" "MN" "MN" ...
summary(merged_beers)
##    Brewery_id      BeerName            Beer_ID            ABV         
##  Min.   :  1.0   Length:2410        Min.   :   1.0   Min.   :0.00100  
##  1st Qu.: 94.0   Class :character   1st Qu.: 808.2   1st Qu.:0.05000  
##  Median :206.0   Mode  :character   Median :1453.5   Median :0.05600  
##  Mean   :232.7                      Mean   :1431.1   Mean   :0.05977  
##  3rd Qu.:367.0                      3rd Qu.:2075.8   3rd Qu.:0.06700  
##  Max.   :558.0                      Max.   :2692.0   Max.   :0.12800  
##                                                      NA's   :62       
##       IBU            Style               Ounces       BrewerName       
##  Min.   :  4.00   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 21.00   Class :character   1st Qu.:12.00   Class :character  
##  Median : 35.00   Mode  :character   Median :12.00   Mode  :character  
##  Mean   : 42.71                      Mean   :13.59                     
##  3rd Qu.: 64.00                      3rd Qu.:16.00                     
##  Max.   :138.00                      Max.   :32.00                     
##  NA's   :1005                                                          
##      City              State          
##  Length:2410        Length:2410       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
#We will view box plots of ABV and IBU to explore the dataset
ggplot(data=merged_beers) +
  geom_boxplot(mapping = aes(x=State, y=ABV, fill=State), outlier.colour="black") +
  theme_classic() +
  ggtitle("ABV By State") +
  theme(plot.title = element_text(hjust = 0.5)) 

ggplot(data=merged_beers) +
  geom_boxplot(mapping = aes(x=State, y=IBU, fill=State), outlier.colour="black") +
  theme_classic()  +
  ggtitle("IBU By State") +
  theme(plot.title = element_text(hjust = 0.5)) 

################
## Lets Make a Better BoxPlot For Use In Presentations
################
color_by_median <- merged_beers %>%
  group_by(State) %>%
  summarise(MedianABV = median(ABV, na.rm = TRUE), MedianIBU = median(IBU, na.rm = TRUE))

color_by_median <- left_join(color_by_median, merged_beers, by = c("State" = "State"))

ggplot(data=color_by_median) +
  geom_boxplot(mapping = aes(x=State, y=ABV, fill=MedianIBU), outlier.colour="black") +
  theme_classic()  +
  ggtitle("ABV By State") +
  theme(plot.title = element_text(hjust = 0.5))  +
  guides(fill = guide_legend(title = "Median IBU")) +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(color_by_median$ABV, na.rm=TRUE)+0.005))

#ABV sorted by top median
ggplot(data=color_by_median) +
  geom_boxplot(mapping = aes(x=reorder(color_by_median$State, color_by_median$MedianABV), y=ABV, fill=MedianIBU), outlier.colour="black") +
  theme_classic()  +
  ggtitle("Sorted Median ABV By State") +
  theme(plot.title = element_text(hjust = 0.5))  +
  guides(fill = guide_legend(title = "Median IBU")) +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(color_by_median$ABV, na.rm=TRUE)+0.005)) +
  labs(x="State", y="ABV")

ggplot(data=color_by_median) +
  geom_boxplot(mapping = aes(x=State, y=IBU, fill=MedianABV), outlier.colour="black") +
  theme_classic()  +
  ggtitle("IBU By State") +
  theme(plot.title = element_text(hjust = 0.5))  +
  guides(fill = guide_legend(title = "Median ABV")) +
  scale_y_continuous(limits = c(NA, max(color_by_median$IBU, na.rm=TRUE)+0.005)) +
  scale_fill_gradient( labels = percent)

#IBU sorted by top median
ggplot(data=color_by_median) +
  geom_boxplot(mapping = aes(x=reorder(color_by_median$State, color_by_median$MedianIBU), y=IBU, fill=MedianABV), outlier.colour="black") +
  theme_classic()  +
  ggtitle("Sorted Median IBU By State") +
  theme(plot.title = element_text(hjust = 0.5))  +
  guides(fill = guide_legend(title = "Median ABV")) +
  scale_y_continuous(limits = c(NA, max(color_by_median$IBU, na.rm=TRUE)+0.005)) +
  scale_fill_gradient( labels = percent) +
  labs(x="State", y="IBU")

#Draw a line at 50 so we can determine High Versus Low
#IBU sorted by top median w/line
ggplot(data=color_by_median) +
  geom_boxplot(mapping = aes(x=reorder(color_by_median$State, color_by_median$MedianIBU), y=IBU, fill=MedianABV), outlier.colour="black") +
  geom_abline(intercept = 50, slope = 0, na.rm=TRUE, linetype="dashed", size=1.5, col = "Red3") + 
  theme_classic()  +
  ggtitle("Sorted 50< Median IBU >50 By State") +
  theme(plot.title = element_text(hjust = 0.5))  +
  guides(fill = guide_legend(title = "Median ABV")) +
  scale_y_continuous(limits = c(NA, max(color_by_median$IBU, na.rm=TRUE)+0.005)) +
  scale_fill_gradient( labels = percent) +
  labs(x="State", y="IBU") +
  annotate("text", x=5, y=4, label = "<50 IBU = Malty", size=5, fontface = "bold", col="Red3") +
  annotate("text", x=5, y=120, label = ">50 IBU = Hoppy", size=5, fontface = "bold", col = "Green4")

#And we will find the count of brews by state
(brews_count <- merged_beers %>%
    group_by(State) %>%
    count(State))
## # A tibble: 51 x 2
## # Groups:   State [51]
##    State     n
##    <chr> <int>
##  1 AK       25
##  2 AL       10
##  3 AR        5
##  4 AZ       47
##  5 CA      183
##  6 CO      265
##  7 CT       27
##  8 DC        8
##  9 DE        2
## 10 FL       58
## 11 GA       16
## 12 HI       27
## 13 IA       30
## 14 ID       30
## 15 IL       91
## 16 IN      139
## 17 KS       23
## 18 KY       21
## 19 LA       19
## 20 MA       82
## 21 MD       21
## 22 ME       27
## 23 MI      162
## 24 MN       55
## 25 MO       42
## 26 MS       11
## 27 MT       40
## 28 NC       59
## 29 ND        3
## 30 NE       25
## 31 NH        8
## 32 NJ        8
## 33 NM       14
## 34 NV       11
## 35 NY       74
## 36 OH       49
## 37 OK       19
## 38 OR      125
## 39 PA      100
## 40 RI       27
## 41 SC       14
## 42 SD        7
## 43 TN        6
## 44 TX      130
## 45 UT       26
## 46 VA       40
## 47 VT       27
## 48 WA       68
## 49 WI       87
## 50 WV        2
## 51 WY       15
###################
## THIS CODE IS INCLUDED TO CREATE BAGPLOT FOR VISUALS
## PURELY FOR EXPLORATORY DATA ANALYSIS AND POTENTIAL PRESENTATION
###################
StatBag <- ggproto("Statbag", Stat,
                   compute_group = function(data, scales, prop = 0.5) {
                     
                     #################################
                     #################################
                     # originally from aplpack package, plotting functions removed
                     plothulls_ <- function(x, y, fraction, n.hull = 1,
                                            col.hull, lty.hull, lwd.hull, density=0, ...){
                       # function for data peeling:
                       # x,y : data
                       # fraction.in.inner.hull : max percentage of points within the hull to be drawn
                       # n.hull : number of hulls to be plotted (if there is no fractiion argument)
                       # col.hull, lty.hull, lwd.hull : style of hull line
                       # plotting bits have been removed, BM 160321
                       # pw 130524
                       if(ncol(x) == 2){ y <- x[,2]; x <- x[,1] }
                       n <- length(x)
                       if(!missing(fraction)) { # find special hull
                         n.hull <- 1
                         if(missing(col.hull)) col.hull <- 1
                         if(missing(lty.hull)) lty.hull <- 1
                         if(missing(lwd.hull)) lwd.hull <- 1
                         x.old <- x; y.old <- y
                         idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
                         for( i in 1:(length(x)/3)){
                           x <- x[-idx]; y <- y[-idx]
                           if( (length(x)/n) < fraction ){
                             return(cbind(x.hull,y.hull))
                           }
                           idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx];
                         }
                       }
                       if(missing(col.hull)) col.hull <- 1:n.hull
                       if(length(col.hull)) col.hull <- rep(col.hull,n.hull)
                       if(missing(lty.hull)) lty.hull <- 1:n.hull
                       if(length(lty.hull)) lty.hull <- rep(lty.hull,n.hull)
                       if(missing(lwd.hull)) lwd.hull <- 1
                       if(length(lwd.hull)) lwd.hull <- rep(lwd.hull,n.hull)
                       result <- NULL
                       for( i in 1:n.hull){
                         idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
                         result <- c(result, list( cbind(x.hull,y.hull) ))
                         x <- x[-idx]; y <- y[-idx]
                         if(0 == length(x)) return(result)
                       }
                       result
                     } # end of definition of plothulls
                     #################################
                     
                     
                     # prepare data to go into function below
                     the_matrix <- matrix(data = c(data$x, data$y), ncol = 2)
                     
                     # get data out of function as df with names
                     setNames(data.frame(plothulls_(the_matrix, fraction = prop)), nm = c("x", "y"))
                     # how can we get the hull and loop vertices passed on also?
                   },
                   
                   required_aes = c("x", "y")
)

#' @inheritParams ggplot2::stat_identity
#' @param prop Proportion of all the points to be included in the bag (default is 0.5)
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
                     position = "identity", na.rm = FALSE, show.legend = NA, 
                     inherit.aes = TRUE, prop = 0.5, alpha = 0.3, ...) {
  layer(
    stat = StatBag, data = data, mapping = mapping, geom = geom, 
    position = position, show.legend = show.legend, inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, prop = prop, alpha = alpha, ...)
  )
}


geom_bag <- function(mapping = NULL, data = NULL,
                     stat = "identity", position = "identity",
                     prop = 0.5, 
                     alpha = 0.3,
                     ...,
                     na.rm = FALSE,
                     show.legend = NA,
                     inherit.aes = TRUE) {
  layer(
    data = data,
    mapping = mapping,
    stat = StatBag,
    geom = GeomBag,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = list(
      na.rm = na.rm,
      alpha = alpha,
      prop = prop,
      ...
    )
  )
}

#' @rdname ggplot2-ggproto
#' @format NULL
#' @usage NULL
#' @export
GeomBag <- ggproto("GeomBag", Geom,
                   draw_group = function(data, panel_scales, coord) {
                     n <- nrow(data)
                     if (n == 1) return(zeroGrob())
                     
                     munched <- coord_munch(coord, data, panel_scales)
                     # Sort by group to make sure that colors, fill, etc. come in same order
                     munched <- munched[order(munched$group), ]
                     
                     # For gpar(), there is one entry per polygon (not one entry per point).
                     # We'll pull the first value from each group, and assume all these values
                     # are the same within each group.
                     first_idx <- !duplicated(munched$group)
                     first_rows <- munched[first_idx, ]
                     
                     ggplot2:::ggname("geom_bag",
                                      grid:::polygonGrob(munched$x, munched$y, default.units = "native",
                                                         id = munched$group,
                                                         gp = grid::gpar(
                                                           col = first_rows$colour,
                                                           fill = alpha(first_rows$fill, first_rows$alpha),
                                                           lwd = first_rows$size * .pt,
                                                           lty = first_rows$linetype
                                                         )
                                      )
                     )
                     
                     
                   },
                   
                   default_aes = aes(colour = "NA", fill = "grey20", size = 0.5, linetype = 1,
                                     alpha = NA, prop = 0.5),
                   
                   handle_na = function(data, params) {
                     data
                   },
                   
                   required_aes = c("x", "y"),
                   
                   draw_key = draw_key_polygon
)




#create a clean space to work
bag_of_beers <- merged_beers

#find styles
bag_of_beers$Style <- str_trim(bag_of_beers$Style)
bag_of_beers$Style <- str_replace_all(bag_of_beers$Style, "[[:punct:]]", "")


style_counts<- bag_of_beers %>%
  group_by(Style) %>%
  count

style_counts<-style_counts[order(-style_counts$n), ]


#Find Quartiles
bag_of_beers_summary <- summary(bag_of_beers$ABV)
ABV_Q1 <- bag_of_beers_summary[2]
ABV_Q2 <- bag_of_beers_summary[3]
ABV_Q3 <- bag_of_beers_summary[5]

bag_of_beers_summary <- summary(bag_of_beers$IBU)
IBU_Q1 <- bag_of_beers_summary[2]
IBU_Q2 <- bag_of_beers_summary[3]
IBU_Q3 <- bag_of_beers_summary[5]

#find top styles
STYLE_1 <- as.character(style_counts[1,1])
STYLE_2 <- as.character(style_counts[2,1])
STYLE_3 <- as.character(style_counts[3,1])

STYLE_1 <- str_replace_all(STYLE_1, "[[:punct:]]", "")
STYLE_2 <- str_replace_all(STYLE_2, "[[:punct:]]", "")
STYLE_3 <- str_replace_all(STYLE_3, "[[:punct:]]", "")

bag_of_beers$IBU_class <- ifelse(bag_of_beers$IBU<=IBU_Q1, "< 1st Quartile", 
                                 ifelse(bag_of_beers$IBU>IBU_Q1 & bag_of_beers$IBU<=IBU_Q2, "< 2nd Quartile (Median)",
                                        ifelse(bag_of_beers$IBU>IBU_Q2 & bag_of_beers$IBU<=IBU_Q3, "< 3rd Quartile", 
                                               ifelse(bag_of_beers$IBU>IBU_Q3, "> 3rd Quartile", "none"))))

bag_of_beers$ABV_class <-  ifelse(bag_of_beers$ABV<=ABV_Q1, "< 1st Quartile", 
                                  ifelse(bag_of_beers$ABV>ABV_Q1 & bag_of_beers$ABV<=ABV_Q2, "< 2nd Quartile (Median)",
                                         ifelse(bag_of_beers$ABV>ABV_Q2 & bag_of_beers$ABV<=ABV_Q3, "< 3rd Quartile", 
                                                ifelse(bag_of_beers$ABV>ABV_Q3 & bag_of_beers$ABV<=1, "> 3rd Quartile", "None"))))

bag_of_beers$Style_class <- ifelse(str_detect(bag_of_beers$Style,STYLE_1)==TRUE, STYLE_1, 
                                 ifelse(str_detect(bag_of_beers$Style,STYLE_2)==TRUE, STYLE_2,
                                        ifelse(str_detect(bag_of_beers$Style, STYLE_3)==TRUE, STYLE_3, NA)))
                                              
#find style medians for ABV and IBU
style_medians <- merged_beers %>%
    group_by(Style) %>%
    summarise(MedianABV = median(ABV, na.rm = TRUE), MedianIBU = median(IBU, na.rm = TRUE))

style_medians <- left_join(style_medians, style_counts, by = c("Style" = "Style"))
#bag_of_beers <- left_join(bag_of_beers, style_medians, by = c("Style" = "Style"))


#CREATE BAGPLOTS
ggplot(data=bag_of_beers, aes(x=IBU, y=ABV, fill=ABV_class)) +
  geom_point() + 
  stat_bag(prop = 0.95) +  # enclose 95% of points
  stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
  stat_bag(prop = 0.05, alpha = 0.9) + # enclose 5% of points
  ggtitle("ABV Class Groups") +
  theme(plot.title = element_text(hjust = 0.5)) 

ggplot(data=bag_of_beers, aes(x=IBU, y=ABV, fill=IBU_class)) +
  geom_point() + 
  stat_bag(prop = 0.95) +  # enclose 95% of points
  stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
  stat_bag(prop = 0.05, alpha = 0.9) + # enclose 5% of points
  ggtitle("IBU Class Groups") +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(data=subset(bag_of_beers, !is.na(Style_class)), aes(x=IBU, y=ABV, fill=Style_class)) +
  geom_point() + 
  stat_bag(prop = 0.95) +  # enclose 95% of points
  stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
  stat_bag(prop = 0.05, alpha = 0.9) + # enclose 5% of points
  ggtitle("Top 3 Most Popular Style Class Groups") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Top 3 Styles"))

#plot of ABV vs IBU for the Top x Styles

#choose x
num_rows<-5

ggplot(data=head(style_medians[order(-style_medians$n),],num_rows), 
       aes(x=MedianIBU, y=MedianABV, size=n, fill=Style, label = Style)) +
  geom_jitter(shape = 21)  +
 geom_text(aes(label=head(style_medians[order(-style_medians$n),],num_rows)$n), size=5
           , position = position_dodge(width=0), vjust = -1, hjust= 0
           #, position = position_nudge(y = 0.0015)
           ) +
  ggtitle(paste0("ABV vs IBU for the Top ", num_rows," Most Popular Styles"))+
  theme(plot.title = element_text(hjust = 1)) +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(style_medians$MedianABV)+0.005)) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
  theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
  labs(x = "Median IBU", y="Median ABV") +
  labs(caption = "Bubble size represents most popularly represented styles by number of beers in the provided data.") +
  scale_size_continuous(range=c(1, 30)) +
  scale_colour_continuous(guide = FALSE) +
  guides(fill = guide_legend(title = "Beer Style"))

#choose x
num_rows<-10

ggplot(data=head(style_medians[order(-style_medians$n),],num_rows), 
       aes(x=MedianIBU, y=MedianABV, size=n, fill=Style, label = Style)) +
  geom_jitter(shape = 21)  +
  geom_text(aes(label=head(style_medians[order(-style_medians$n),],num_rows)$n), size=5
            , position = position_dodge(width=0), vjust = -0.5, hjust= 0.1
            #, position = position_nudge(y = 0.0015)
  ) +
  ggtitle(paste0("ABV vs IBU for the Top ", num_rows," Most Popular Styles"))+
  theme(plot.title = element_text(hjust = 1)) +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(style_medians$MedianABV)+0.005)) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
  theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
  labs(x = "Median IBU", y="Median ABV") +
  labs(caption = "Bubble size represents most popularly represented styles by number of beers in the provided data.") +
  scale_size_continuous(range=c(1, 30)) +
  scale_colour_continuous(guide = FALSE) +
  guides(fill = guide_legend(title = "Beer Style"))

#choose x
num_rows<-5

ggplot(data=head(style_medians[order(-style_medians$n),],num_rows), 
       aes(x=MedianIBU, y=MedianABV, size=n, fill=Style, label = Style)) +
  geom_jitter(shape = 21)  +
  geom_text(aes(label=str_remove(head(style_medians[order(-style_medians$n),],num_rows)$Style, "American ")), size=5
            , position = position_dodge(width=0), vjust = -.06, hjust= 0
            #, position = position_nudge(y = 0.0015)
  ) +
  ggtitle(paste0("ABV vs IBU for the Top ", num_rows," Most Popular Styles"))+
  theme(plot.title = element_text(hjust = 1)) +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(style_medians$MedianABV)+0.005)) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
  theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
  labs(x = "Median IBU", y="Median ABV") +
  labs(caption = "Bubble size represents most popularly represented styles by number of beers in the provided data.") +
  scale_size_continuous(range=c(1, 30)) +
  scale_colour_continuous(guide = FALSE) +
  guides(fill = guide_legend(title = "Beer Style"))

##################
#Plot facets to view by state
#this might be helpful to view gaps in market or saturation
ggplot(data = merged_beers) +
  geom_jitter(mapping = aes(x=ABV, y=IBU)) +
  facet_wrap(~ State, nrow = 13)

#Finally, lets print first 6 lines and last six lines 
#so we can answer the question directly
kable(head(merged_beers, 6))
Brewery_id BeerName Beer_ID ABV IBU Style Ounces BrewerName City State
1 Get Together 2692 0.045 50 American IPA 16 NorthGate Brewing Minneapolis MN
1 Maggie’s Leap 2691 0.049 26 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis MN
1 Wall’s End 2690 0.048 19 English Brown Ale 16 NorthGate Brewing Minneapolis MN
1 Pumpion 2689 0.060 38 Pumpkin Ale 16 NorthGate Brewing Minneapolis MN
1 Stronghold 2688 0.060 25 American Porter 16 NorthGate Brewing Minneapolis MN
1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis MN
kable(tail(merged_beers, 6))
Brewery_id BeerName Beer_ID ABV IBU Style Ounces BrewerName City State
2405 556 Pilsner Ukiah 98 0.055 NA German Pilsener 12 Ukiah Brewing Company Ukiah CA
2406 557 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen 12 Butternuts Beer and Ale Garrattsville NY
2407 557 Snapperhead IPA 51 0.068 NA American IPA 12 Butternuts Beer and Ale Garrattsville NY
2408 557 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville NY
2409 557 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville NY
2410 558 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale 12 Sleeping Lady Brewing Company Anchorage AK


Hopsy Question 2

Please merge the data sets provided (beers & breweries).

As requested, the R Mustangs team merged the data sets provided. Although we found some gaps in the data, we were successful in pulling all data together for analysis.

Displayed above are various Exploratory Data Analysis (EDA) techniqiues we employed on the data to validate and understand what has been provided. Directly above, the first and last 6 lines of the merged dataset is provided for reference and understanding.



#Question 3 - We will find out how many NAs are in each column for the merged dataset, 
#so we can report it
kable(colSums(is.na(merged_beers)))
x
Brewery_id 0
BeerName 0
Beer_ID 0
ABV 62
IBU 1005
Style 5
Ounces 0
BrewerName 0
City 0
State 0
missing_values <- colSums(is.na(merged_beers))
missing_values <- as.data.frame(missing_values)

names(missing_values)[1] <- "MissingValues"
missing_values<-missing_values %>% rownames_to_column("Variable")


ggplot(data = missing_values, aes(x=Variable, y=MissingValues, fill=MissingValues)) +
  geom_bar(stat = "identity", colour="black") + 
  geom_text(aes(label=MissingValues, size=5, vjust = -0.5, hjust= 0.5), show.legend = FALSE) +
  ggtitle("Missing Values") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Variables", y="MIssing Values") +
  guides(fill = guide_legend(title = "Dataset Missing Values")) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  theme(axis.title=element_text(face="bold"))



Hopsy Question 3

Report on Missing Values

Shown above is a “Missing Values” report that shows where we have gaps in the existing data. While not ideal, we can still proceed with analysis and proceed with caution under this understanding. For the future, we would recommend review on data quality procedures in order to ensure the lineage and sanctity of the data sets wherever possible.



#Question 4 - We will compute the median alcohol content and international bitterness unit by state
#then plot the results
(medians <- merged_beers %>%
    group_by(State) %>%
    summarise(MedianABV = median(ABV, na.rm = TRUE), MedianIBU = median(IBU, na.rm = TRUE)))
## # A tibble: 51 x 3
##    State MedianABV MedianIBU
##    <chr>     <dbl>     <dbl>
##  1 AK       0.056       46  
##  2 AL       0.06        43  
##  3 AR       0.052       39  
##  4 AZ       0.055       20.5
##  5 CA       0.058       42  
##  6 CO       0.0605      40  
##  7 CT       0.06        29  
##  8 DC       0.0625      47.5
##  9 DE       0.055       52  
## 10 FL       0.057       55  
## 11 GA       0.055       55  
## 12 HI       0.054       22.5
## 13 IA       0.0555      26  
## 14 ID       0.0565      39  
## 15 IL       0.058       30  
## 16 IN       0.058       33  
## 17 KS       0.05        20  
## 18 KY       0.0625      31.5
## 19 LA       0.052       31.5
## 20 MA       0.054       35  
## 21 MD       0.058       29  
## 22 ME       0.051       61  
## 23 MI       0.062       35  
## 24 MN       0.056       44.5
## 25 MO       0.052       24  
## 26 MS       0.058       45  
## 27 MT       0.055       40  
## 28 NC       0.057       33.5
## 29 ND       0.05        32  
## 30 NE       0.056       35  
## 31 NH       0.055       48.5
## 32 NJ       0.046       34.5
## 33 NM       0.062       51  
## 34 NV       0.06        41  
## 35 NY       0.055       47  
## 36 OH       0.058       40  
## 37 OK       0.06        35  
## 38 OR       0.056       40  
## 39 PA       0.057       30  
## 40 RI       0.055       24  
## 41 SC       0.055       30  
## 42 SD       0.06        NA  
## 43 TN       0.0570      37  
## 44 TX       0.055       33  
## 45 UT       0.04        34  
## 46 VA       0.0565      42  
## 47 VT       0.055       30  
## 48 WA       0.0555      38  
## 49 WI       0.052       19  
## 50 WV       0.062       57.5
## 51 WY       0.05        21
#Plot ABV graph
#Show ggplot. Center title, with axis titles
ggplot(data = medians, aes(x=medians$State, y=medians$MedianABV, fill=medians$MedianABV)) +
  geom_bar(stat = "identity", colour="black") + 
  scale_fill_gradient2(labels = percent, midpoint=median(medians$MedianABV, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(medians$MedianABV)+0.005)) +
  ##sec.axis = sec_axis(~. * 25, name = "MedianIBU") +
  ggtitle("Median ABV By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="State", y="Median ABV in %") +
  guides(fill = guide_legend(title = "Median ABV")) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  theme(axis.title=element_text(face="bold")) 

#Plot ABV sort graph
#Show ggplot. Center title, with axis titles
ggplot(data = medians, aes(x=reorder(medians$State, medians$MedianABV), y=medians$MedianABV, fill=medians$MedianABV)) +
  geom_bar(stat = "identity", colour="black") + 
  scale_fill_gradient2(labels = percent, midpoint=median(medians$MedianABV, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(medians$MedianABV)+0.005)) +
  ##sec.axis = sec_axis(~. * 25, name = "MedianIBU") +
  ggtitle("Median ABV By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="State", y="Median ABV in %") +
  guides(fill = guide_legend(title = "Median ABV")) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  theme(axis.title=element_text(face="bold")) 

#Set chart title for mean
mean_ABV <- paste0("ABV Histogram: Mean = ",round(mean(merged_beers$ABV, na.rm = TRUE), digits = 3)*100, "%")

#Show Histogram to show distribution of values for ABV
ggplot(data=merged_beers, aes(merged_beers$ABV, fill=..count..)) + 
  geom_histogram(binwidth = .005) +
  geom_vline(aes(xintercept = mean(merged_beers$ABV, na.rm = TRUE)),col='black', size=2) +
  labs(x="ABV", y="Frequency") +
  guides(fill = guide_legend(title = "ABV Frequency")) +
  ggtitle(mean_ABV) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
  theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold"))

#Plot IBU graph
ggplot(data = medians, aes(x=medians$State, y=medians$MedianIBU, fill=medians$MedianIBU)) +
  geom_bar(stat = "identity", colour="black") + 
  scale_fill_gradient2(midpoint=median(medians$MedianIBU, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
  scale_y_continuous(limits = c(NA, max(medians$MedianIBU)+0.005)) +
  ggtitle("Median IBU By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="State", y="Median IBU") +
  guides(fill = guide_legend(title = "Median IBU")) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  theme(axis.title=element_text(face="bold"))

#Plot IBU sort graph
ggplot(data = medians, aes(x=reorder(medians$State, medians$MedianIBU), y=medians$MedianIBU, fill=medians$MedianIBU)) +
  geom_bar(stat = "identity", colour="black") + 
  scale_fill_gradient2(midpoint=median(medians$MedianIBU, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
  scale_y_continuous(limits = c(NA, max(medians$MedianIBU)+0.005)) +
  ggtitle("Median IBU By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="State", y="Median IBU") +
  guides(fill = guide_legend(title = "Median IBU")) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  theme(axis.title=element_text(face="bold"))

#Plot IBU sort graph
#Draw a line at 50 so we can determine High Versus Low
ggplot(data = medians, aes(x=reorder(medians$State, medians$MedianIBU), y=medians$MedianIBU, fill=medians$MedianIBU)) +
  geom_bar(stat = "identity", colour="black") + 
  geom_abline(intercept = 50, slope = 0, na.rm=TRUE, linetype="dashed", size=1.5, col = "Red3") + 
  scale_fill_gradient2(midpoint=median(medians$MedianIBU, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
  scale_y_continuous(limits = c(NA, max(medians$MedianIBU)+0.005)) +
  ggtitle("Median IBU By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="State", y="Median IBU") +
  guides(fill = guide_legend(title = "Median IBU")) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  theme(axis.title=element_text(face="bold")) +
  annotate("text", x=5, y=30, label = "<50 IBU = Malty", size=5, fontface = "bold", col="Red3") +
  annotate("text", x=5, y=80, label = ">50 IBU = Hoppy", size=5, fontface = "bold", col = "Green4")

#Set chart title for mean
mean_IBU <- paste0("IBU Histogram: Mean = ",round(mean(merged_beers$IBU, na.rm = TRUE), digits = 0))

#Show Histogram to show distribution of values for IBU
ggplot(data=merged_beers, aes(merged_beers$IBU, fill=..count..)) + 
  geom_histogram(binwidth = 5) +
  geom_vline(aes(xintercept = mean(merged_beers$IBU, na.rm = TRUE)),col='black',size=2) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
  theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
  labs(x="IBU Frequency", y="Frequency") +
  guides(fill = guide_legend(title = "IBU")) +
  ggtitle(mean_IBU) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
  theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold"))



Question 4

Compute the Median Alcohol content (ABV) and International Bitterness Unit (IBU) for each state.

Display graphically.

Displayed above are various graphs of both ABV and IBU as they relate to the United States. Please note the sorted graphs are the most informative for this exercise as they will help distinguish which states have what types of preferences for their locality. Alcohol content and bitterness preference vary in terms of regional tastes.



#Question 5 - Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?

#Lets find out explictly which state has the highest ABV and IBU
#We will answer this with explicit data pulls below, plus graphically with existing charts

#State with max ABV
kable(merged_beers %>%
  filter(ABV == max(ABV, na.rm=TRUE)))
Brewery_id BeerName Beer_ID ABV IBU Style Ounces BrewerName City State
52 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128 NA Quadrupel (Quad) 19.2 Upslope Brewing Company Boulder CO
#Top 5 with max absolute ABV
kable(head(merged_beers[order(-merged_beers$ABV),], 5))
Brewery_id BeerName Beer_ID ABV IBU Style Ounces BrewerName City State
375 52 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128 NA Quadrupel (Quad) 19.2 Upslope Brewing Company Boulder CO
8 2 London Balling 2685 0.125 80 English Barleywine 16.0 Against the Grain Brewery Louisville KY
144 18 Csar 2621 0.120 90 Russian Imperial Stout 16.0 Tin Man Brewing Company Evansville IN
376 52 Lee Hill Series Vol. 4 - Manhattan Style Rye Ale 2564 0.104 NA Rye Beer 19.2 Upslope Brewing Company Boulder CO
336 47 4Beans 2574 0.100 52 Baltic Porter 12.0 Sixpoint Craft Ales Brooklyn NY
#State with max IBU
merged_beers %>%
  filter(IBU == max(IBU, na.rm=TRUE))
##   Brewery_id                  BeerName Beer_ID   ABV IBU
## 1        375 Bitter Bitch Imperial IPA     980 0.082 138
##                            Style Ounces              BrewerName    City
## 1 American Double / Imperial IPA     12 Astoria Brewing Company Astoria
##   State
## 1    OR
#Top 5 with max absolute IBU
kable(head(merged_beers[order(-merged_beers$IBU),], 5))
Brewery_id BeerName Beer_ID ABV IBU Style Ounces BrewerName City State
1857 375 Bitter Bitch Imperial IPA 980 0.082 138 American Double / Imperial IPA 12 Astoria Brewing Company Astoria OR
1719 345 Troopers Alley IPA 1676 0.059 135 American IPA 12 Wolf Hills Brewing Company Abingdon VA
1305 231 Dead-Eye DIPA 2067 0.090 130 American Double / Imperial IPA 16 Cape Ann Brewing Company Gloucester MA
625 100 Bay of Bengal Double IPA (2014) 2440 0.089 126 American Double / Imperial IPA 12 Christian Moerlein Brewing Company Cincinnati OH
431 62 Abrasive Ale 15 0.097 120 American Double / Imperial IPA 16 Surly Brewing Company Brooklyn Center MN
# MAX MEDIAN ABV STATES
kable(head(medians[order(-medians$MedianABV),], 5))
State MedianABV MedianIBU
DC 0.0625 47.5
KY 0.0625 31.5
MI 0.0620 35.0
NM 0.0620 51.0
WV 0.0620 57.5
# MAX MEDIAN IBU STATES
kable(head(medians[order(-medians$MedianIBU),], 5))
State MedianABV MedianIBU
ME 0.051 61.0
WV 0.062 57.5
FL 0.057 55.0
GA 0.055 55.0
DE 0.055 52.0


Question 5

Which state has the maximum alcoholic (ABV) beer?

Which state has the most bitter (IBU) beer?

The maximum alcohol content (ABV) beer is a beer from Colorado named the Lee Hill Series Vol 5 - Belgian Style Quadrupel Ale, from a brewer named Upslope Breweing Company. This beer has an ABV of 12.8%, which represents an outlier in terms of the overall sample of craft beers we reviewed.

The maximum bitterness (IBU) beer is a beer from Oregon named the Bitter Bitch Imperial IPA, from a brewer by the name of Astoria Brewing Company. Their reported IBU for that beer is 138, which also represents a very high number. This also may represent suspect data as the IBU rating is a rating that measures the amount of isomerized alpha acids in a beer. Humans can only detect up to 100, which is why typically IBU is reported on a 0 to 100 scale. We merely recommend keeping this in mind for the upper echelons of IBU.

There is nothing particularly wrong the reported 138 IBU as its still representative of hops alpha acids for the beer. However, for future research we need to keep in mind the upper bound of 100 for taste preferences. Its still correct to report 138 IBU as it relates to alcohol content as we will see later in the analysis.

For comprehensiveness we also find above the max Median ABV and max Median IBU states as well. District of Columbia and Kentucky are the maximum Median ABV territory and State (6.25% Alcohol), and Maine is the maximum Median IBU State (61 IBU), followed by West Virgina (57 IBU). This is also displayed above in tabular form.



#Question 6 - We will aggregate summary statistics for ABV
#Summary for ABV Stats

kable(as.array(summary(merged_beers$ABV)))
Var1 Freq
Min. 0.0010000
1st Qu. 0.0500000
Median 0.0560000
Mean 0.0597734
3rd Qu. 0.0670000
Max. 0.1280000
NA’s 62.0000000
describe(merged_beers$ABV)
## merged_beers$ABV 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2348       62       74    0.998  0.05977  0.01469    0.042    0.045 
##      .25      .50      .75      .90      .95 
##    0.050    0.056    0.067    0.080    0.087 
## 
## lowest : 0.001 0.027 0.028 0.032 0.034, highest: 0.100 0.104 0.120 0.125 0.128
kable(stat.desc(merged_beers$ABV))
x
nbr.val 2348.0000000
nbr.null 0.0000000
nbr.na 62.0000000
min 0.0010000
max 0.1280000
range 0.1270000
sum 140.3480000
median 0.0560000
mean 0.0597734
SE.mean 0.0002795
CI.mean.0.95 0.0005480
var 0.0001834
std.dev 0.0135417
coef.var 0.2265511
#We will do IBU for good measure
kable(as.array(summary(merged_beers$IBU)))
Var1 Freq
Min. 4.00000
1st Qu. 21.00000
Median 35.00000
Mean 42.71317
3rd Qu. 64.00000
Max. 138.00000
NA’s 1005.00000
describe(merged_beers$IBU)
## merged_beers$IBU 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1405     1005      107    0.999    42.71    28.81       11       15 
##      .25      .50      .75      .90      .95 
##       21       35       64       80       92 
## 
## lowest :   4   5   6   7   8, highest: 120 126 130 135 138
kable(stat.desc(merged_beers$IBU))
x
nbr.val 1.405000e+03
nbr.null 0.000000e+00
nbr.na 1.005000e+03
min 4.000000e+00
max 1.380000e+02
range 1.340000e+02
sum 6.001200e+04
median 3.500000e+01
mean 4.271317e+01
SE.mean 6.924162e-01
CI.mean.0.95 1.358282e+00
var 6.736135e+02
std.dev 2.595407e+01
coef.var 6.076362e-01


Question 6

Summary Statistics for ABV

Summary and descritive statistics we aggregated for both ABV and IBU. This is covered in detail in our presentation, and is displayed above. This data, in combination with the histgrams generated, cover the full shape and behavior of the data provided.



#Question 7 - There seems to be a linear relation to ABV and IBU. We will draw a scatter plot
#Plus, we will find the linear regression line fit so we can predict missing values
reg<-lm(ABV ~ IBU, data = merged_beers)
reg
## 
## Call:
## lm(formula = ABV ~ IBU, data = merged_beers)
## 
## Coefficients:
## (Intercept)          IBU  
##   0.0449303    0.0003508
(r_squared<-summary(lm(ABV~IBU, merged_beers))$r.squared)
## [1] 0.4497332
# Equation of the line : 
coeff=coefficients(reg)
eq = paste0("y = ", round(reg$coefficients[2],5)*100, "*x + ", reg$coefficients[1]*100,1)

scatter_title <- paste0("IBU vs ABV Relationship", "\n\n", eq)

#Asside: to find values where we have IBU but do not have ABV
predicted_values_IBU <- merged_beers %>%
  filter(is.na(merged_beers$ABV) == FALSE, is.na(merged_beers$IBU) == TRUE) 

predicted_values_IBU$IBU <- ifelse((predicted_values_IBU$ABV-reg$coefficients[1])/reg$coefficients[2]>0, 
                                   (predicted_values_IBU$ABV-reg$coefficients[1])/reg$coefficients[2], 
                                   NA)

#Plot scatter plot of ABV vs IBU, with prediected values
ggplot(data=merged_beers, aes(x=IBU, y=ABV, color=State)) +
  geom_jitter() +
  geom_abline(intercept = reg$coefficients[1], slope = reg$coefficients[2], na.rm=TRUE, linetype="dashed") + 
  ggtitle(eq) +
  theme(plot.title = element_text(hjust = 1)) +
  scale_y_continuous(labels = scales::percent, limits = c(NA, max(merged_beers$ABV)+0.005)) +
  ggtitle(scatter_title) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
  theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
  geom_point(data = predicted_values_IBU, 
             mapping = aes(x = IBU, y = ABV, shape=State), 
             shape = 0, 
             size =4,
             show.legend=TRUE) +
  labs(caption = "Square datapoints represent predicted calculated values where they were missing.")



Question 7

Is there an apparent relationship between the bitterness of the beer and its alcoholic content?

Draw a scatter plot.

As shown above, there does seem to be a linear relationship to ABV and IBU, as mentioned earlier. Utilizing a linear regression we found the relation with an R Squared = 0.4497332. This says that the fit only explains 45% of the variability around the mean.

Our conclusion based on this cursory analysis is that IBU is an ok proxy for predicting Alcohol content, however it does not predict a preponderance of the cases precisely.



#####################################
## THIS IS EXTRA ANALYSIS TO CONSIDER FOR PRESENTATION AND CASE
#####################################
## IN ORDER TO ROUND OUT THE DISCUSSION FURTHER DATA WAS PULLED
## IN TO ENRICH THE PROVIDED DATASETS
## WE UTILIZE BOTH ALCOHOL CONSUMPTION DATA FROM OPEN ICPSR
## AS WELL AS US CENSUS BUREAU DATA
#####################################
# https://www.openicpsr.org/openicpsr/project/105583/version/V2/view;jsessionid=843DFC2FBDC320D1624BF92319E643FA?path=/openicpsr/105583/fcr:versions/V2/apparent_per_capita_alcohol_consumption.csv&type
##
##https://www2.census.gov/programs-surveys/popproj/datasets/2017/2017-popproj/np2017_d1.csv
#####################################


#Asside: to find na values for style
merged_beers %>%
  filter(is.na(merged_beers$Style) == TRUE)
##   Brewery_id                       BeerName Beer_ID   ABV IBU Style Ounces
## 1         30                Special Release    2210    NA  NA  <NA>     16
## 2         67                  OktoberFiesta    2527 0.053  27  <NA>     12
## 3        161 Kilt Lifter Scottish-Style Ale    1635 0.060  21  <NA>     12
## 4        167                   The CROWLER™    1796    NA  NA  <NA>     32
## 5        167           CAN'D AID Foundation    1790    NA  NA  <NA>     12
##                   BrewerName         City State
## 1        Cedar Creek Brewery Seven Points    TX
## 2   Freetail Brewing Company  San Antonio    TX
## 3 Four Peaks Brewing Company        Tempe    AZ
## 4        Oskar Blues Brewery     Longmont    CO
## 5        Oskar Blues Brewery     Longmont    CO
#####################################
## CREATE MEDIAN ABV MAPPING
#####################################

#rename column in plot_beers for simplicity for mapping
plot_beers <- as_tibble(medians)
plot_beers <- rename(plot_beers, abbr=State)
plot_beers <- left_join(plot_beers, statepop, by = c("abbr" = "abbr"))
#fill in missing values NA=0
#plot_beers[is.na(plot_beers)] <- 0 #<-----Uncomment to place zeros on IBU clorograph
#create a percent column for ABV
plot_beers$ABV_Percent <- plot_beers$MedianABV*100



#lets make a map
usa <- map_data("usa")
w2hr <- map_data("world2Hires")
states <- map_data("state")



#create cap function
simpleCap <- function(x) {
  s <- strsplit(x, " ")[[1]]
  paste(toupper(substring(s, 1,1)), substring(s, 2),
        sep="", collapse=" ")
}

#import states
states$region<-sapply(states$region, simpleCap)
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers, by=c("region" = "full")))

#find centroid of state
mean_geo <- as_tibble(plot_beers_states %>%
  group_by(region) %>%
  summarise(lat=mean(c(max(lat), min(lat))), long=mean(c(max(long), min(long)))))


#joins centroids to original data
plot_beers <-inner_join(plot_beers, mean_geo, by=c("full" = "region"))


#add offsets so I can adjust labeling on the map
plot_beers$offset <- ifelse(plot_beers$full %in% c("New Hampshire",
                                                   #"Vermont", 
                                                   "Massachusetts", 
                                                   "Rhode Island",
                                                   "New York",
                                                   "New Jersey",
                                                   "Delaware"), 1, 0)


#add in brewer counts
plot_beers <- inner_join(plot_beers, breweries_count, by=c("abbr" = "State"))
names(plot_beers)[11] <- "count_breweries"

#add in brew counts
plot_beers <- inner_join(plot_beers, brews_count, by=c("abbr" = "State"))
names(plot_beers)[12] <- "count_brews"



#create a base map
us_base <- ggplot(data = plot_beers_states) + 
  geom_polygon(aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) #+

#cut out the fluff so I can add it back piece by piece
ditch_the_axes <- theme(
  axis.text = element_blank(),
  axis.line = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.title = element_blank()
)

#####################################
## MAP ABV 
#####################################
#create Median % ABV Map
p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = MedianABV), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', labels = scales::percent) +
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(MedianABV*100), "%")), color="black")+
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(MedianABV*100), "%")), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = "Median Percent ABV By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Percent ABV"))
                  
p

#####################################
## CREATE MEDIAN IBU MAPPING
#####################################
#added code here to modify NA labels to "No Data" for clarity
p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = MedianIBU), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white")+
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(ifelse(is.na(MedianIBU)==TRUE, "No Data", MedianIBU)))), color="black") +
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(MedianIBU))), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = "Median IBU By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "IBU"))

p

#####################################
## CREATE BREWERY MAPPING
#####################################

#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,11)], by=c("region" = "full")))

#create a base map
us_base <- ggplot(data = plot_beers_states) + 
  geom_polygon(aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) #+

p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = count_breweries), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white")+
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(count_breweries))), color="black")+
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(count_breweries))), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = "Breweries By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Breweries"))

p

#####################################
## CREATE BREWS MAPPING
#####################################

#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,12)], by=c("region" = "full")))

#create a base map
us_base <- ggplot(data = plot_beers_states) + 
  geom_polygon(aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) #+

p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = count_brews), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white")+
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(count_brews))), color="black")+
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(count_brews))), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = "Craft Beers By State") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Craft Beers"))

p

#####################################
## IMPORT CENSUS DATA & CONSUMPTION
#####################################


#setwd("/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1")
setwd(root_dir)

#lets get population information for 201x from US Census

if(!file.exists("Data/population.csv")){
  res <- tryCatch(download.file(url="https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/national/totals/nst-est2018-popchg2010_2018.csv", 
                                destfile="Data/population.csv", 
                                method="libcurl"), 
                  error=function(e) 1)
}

#download.file(url="https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/national/totals/nst-est2018-popchg2010_2018.csv", destfile="Data/population.csv", method="libcurl")

population <- read_csv("Data/population.csv")

#download.file(url="https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/national/totals/nst-est2018-popchg2010_2018.csv", destfile="Data/population.csv", method="libcurl")

population <- read_csv("Data/population.csv")



#fix State Capitalization
population$NAME<-sapply(population$NAME, simpleCap)
#fix DC
population$NAME<-ifelse(population$NAME =="District of Columbia", "DC", population$NAME)
#remove unneeded data
remove <- with(population, (NAME == "Northeast Region") | (NAME == "Midwest Region") | (NAME == "South Region") | (NAME == "West Region") | (NAME == "United States") | (NAME == "Puerto Rico") )
population<-population[!remove,]



#lets get consumption data for 2016 - pre downloaded from authorized site 
#@misc{openICPS65:online,
#howpublished = {\url{https://www.openicpsr.org/openicpsr/project/105583/version/V2/download/terms?path=/openicpsr/105583/fcr:versions/V2&type=project}},
#note = {(Accessed on 02/19/2019)}
#}

consumption <- read_csv("Data/apparent_per_capita_alcohol_consumption.csv", 
                        col_type =  cols(
                          state = col_character(),
                          year = col_double(),
                          ethanol_beer_gallons_per_capita = col_double(),
                          ethanol_wine_gallons_per_capita = col_double(),
                          ethanol_spirit_gallons_per_capita = col_double(),
                          ethanol_all_drinks_gallons_per_capita = col_double(),
                          number_of_beers = col_double(),
                          number_of_glasses_wine = col_double(),
                          number_of_shots_liquor = col_double(),
                          number_of_drinks_total = col_double()
                        )
                        )

#fix State Capitalization
consumption$state<-sapply(consumption$state, simpleCap)
#fix DC
consumption$state<-ifelse(consumption$state =="District Of Columbia", "DC", consumption$state)
#remove unneeded data
remove <- with(consumption, (state == "Northeast Region") | (state == "Midwest Region") | (state == "South Region") | (state == "West Region") | (state == "Us Total")  )
consumption<-consumption[!remove,]
#find 2016
consump_2016<-filter(consumption, near(year,2016))
  
#lets pull in population by state
consump_2016<-inner_join(consump_2016, population, by=c("state"="NAME"))

gallons_beer_by_state <- consump_2016 %>%
  group_by(state) %>%
  summarise(gallons_beer = ethanol_beer_gallons_per_capita*POPESTIMATE2016)#, beer_per_capita=ethanol_beer_gallons_per_capita)


#add in gallons counts
plot_beers <- inner_join(plot_beers, gallons_beer_by_state, by=c("full" = "state"))
#names(plot_beers)[13] <- "gallons_beer"

#plot_beers <- inner_join(plot_beers, gallons_beer_by_state, by=c("full" = "state"))
#names(plot_beers)[13] <- "gallons_beer"

#add in population and per capita 
#join for lats and longs
plot_beers <- inner_join(plot_beers, consump_2016[,c(1,3,22)], by=c("full" = "state"))
names(plot_beers)[14] <- "gallons_beer_per_capita"
names(plot_beers)[15] <- "population_2016"



#####################################
## CREATE Gallons MAPPING
#####################################

#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,13)], by=c("region" = "full")))

#create a base map
us_base <- ggplot(data = plot_beers_states) + 
  geom_polygon(aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) #+

p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = gallons_beer), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(gallons_beer/1000000, digits=2)))), color="black") +
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(gallons_beer/1000000, digits=2)))), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = "Gallons Beer Consumed By State ('000,000's)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Gallons Beer Consumed"))

p

#####################################
## CREATE MUSTANG SCORE MAPPING
## THIS REPRESENTS THE PUNCHLINE FOR RECOMMENDATIONS ON HOPSY
#####################################

#Lets construct a scoring mechanism based on weighted averages
#Market opportunity is a weighted function of several variables
#Beer consumption in the state, the population of the state, the breweries in the state, and the beer diversity
#Along with an assumption that ABV and IBU are predictors of "at home" demand
#We will construct a score based on the sum of weighted averages for these metrics
#Scoring will be scaled from 0 --> 100 as an aggregate for all states (e.g. the sum of all state scores sum to 100)

total_pop <- sum(plot_beers$population_2016, na.rm=TRUE)
total_gallons <- sum(plot_beers$gallons_beer, na.rm=TRUE)
total_gallons_per_capita <- sum(plot_beers$gallons_beer_per_capita, na.rm=TRUE)
total_breweries <- sum(plot_beers$count_breweries, na.rm=TRUE)
total_brews <- sum(plot_beers$count_brews, na.rm=TRUE)
total_medians <- sum(plot_beers$MedianABV, na.rm=TRUE)
total_ibu <- sum(plot_beers$MedianIBU, na.rm=TRUE)

plot_beers$market_score <- 100*(1/6)*(ifelse(is.na(plot_beers$gallons_beer_per_capita)==TRUE, 0, 0.8*plot_beers$gallons_beer_per_capita)/total_gallons_per_capita + 
          ifelse(is.na(plot_beers$population_2016)==TRUE, 0, 1*plot_beers$population_2016)/total_pop + 
          ifelse(is.na(plot_beers$count_breweries)==TRUE, 0, 1*plot_beers$count_breweries)/total_breweries + 
          ifelse(is.na(plot_beers$count_brews)==TRUE, 0, 1*plot_beers$count_brews)/total_brews +
          ifelse(is.na(plot_beers$MedianABV)==TRUE, 0, 1*plot_beers$MedianABV)/total_medians +
          ifelse(is.na(plot_beers$MedianIBU)==TRUE, 0, 1*plot_beers$MedianIBU) /total_ibu)

#lets see top score areas
head(plot_beers[order(-plot_beers$market_score),], 5)
## # A tibble: 5 x 16
##   abbr  MedianABV MedianIBU fips  full  pop_2015 ABV_Percent   lat   long
##   <chr>     <dbl>     <dbl> <chr> <chr>    <dbl>       <dbl> <dbl>  <dbl>
## 1 CA       0.058         42 06    Cali… 39144818        5.8   37.3 -119. 
## 2 CO       0.0605        40 08    Colo…  5456574        6.05  39.0 -106. 
## 3 TX       0.055         33 48    Texas 27469114        5.5   31.2 -100. 
## 4 MI       0.062         35 26    Mich…  9922576        6.2   44.6  -86.4
## 5 PA       0.057         30 42    Penn… 12802503        5.7   41.0  -77.6
## # … with 7 more variables: offset <dbl>, count_breweries <int>,
## #   count_brews <int>, gallons_beer <dbl>, gallons_beer_per_capita <dbl>,
## #   population_2016 <dbl>, market_score <dbl>
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,16)], by=c("region" = "full")))

#create a base map
us_base <- ggplot(data = plot_beers_states) + 
  geom_polygon(aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) #+

p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = market_score), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), color="black") +
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = paste0("Mustang Score","\n","(All Scores Sum to 100)", "\n", "Equal Weights")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = paste0("Mustang Score","\n","(Higher Is Better)")))

p

#####################################
## CREATE NEW GRAPHIC WITH DOUBLE WEIGHTED IBU
#####################################


plot_beers$market_score <- 100*(1/6)*(ifelse(is.na(plot_beers$gallons_beer_per_capita)==TRUE, 0, 0.8*plot_beers$gallons_beer_per_capita)/total_gallons_per_capita +  
            ifelse(is.na(plot_beers$population_2016)==TRUE, 0, 0.8*plot_beers$population_2016)/total_pop + 
            ifelse(is.na(plot_beers$count_breweries)==TRUE, 0, 0.8*plot_beers$count_breweries)/total_breweries + 
            ifelse(is.na(plot_beers$count_brews)==TRUE, 0, 0.8*plot_beers$count_brews)/total_brews +
            ifelse(is.na(plot_beers$MedianABV)==TRUE, 0, 0.8*plot_beers$MedianABV)/total_medians +
            ifelse(is.na(plot_beers$MedianIBU)==TRUE, 0, 2*plot_beers$MedianIBU) /total_ibu)

#lets see top score areas
head(plot_beers[order(-plot_beers$market_score),], 5)
## # A tibble: 5 x 16
##   abbr  MedianABV MedianIBU fips  full  pop_2015 ABV_Percent   lat   long
##   <chr>     <dbl>     <dbl> <chr> <chr>    <dbl>       <dbl> <dbl>  <dbl>
## 1 CA       0.058         42 06    Cali… 39144818        5.8   37.3 -119. 
## 2 CO       0.0605        40 08    Colo…  5456574        6.05  39.0 -106. 
## 3 TX       0.055         33 48    Texas 27469114        5.5   31.2 -100. 
## 4 MI       0.062         35 26    Mich…  9922576        6.2   44.6  -86.4
## 5 FL       0.057         55 12    Flor… 20271272        5.7   28.1  -83.8
## # … with 7 more variables: offset <dbl>, count_breweries <int>,
## #   count_brews <int>, gallons_beer <dbl>, gallons_beer_per_capita <dbl>,
## #   population_2016 <dbl>, market_score <dbl>
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,16)], by=c("region" = "full")))

#create a base map
us_base <- ggplot(data = plot_beers_states) + 
  geom_polygon(aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) #+

p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = market_score), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), color="black") +
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = paste0("Mustang Score","\n","(All Scores Sum to 100)", "\n", "2x IBU Weight")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = paste0("Mustang Score","\n","(Higher Is Better)")))

p

#####################################
## CREATE NEW GRAPHIC WITH DOUBLE WEIGHTED ABV
#####################################


plot_beers$market_score <- 100*(1/6)*(ifelse(is.na(plot_beers$gallons_beer_per_capita)==TRUE, 0, 0.8*plot_beers$gallons_beer_per_capita)/total_gallons_per_capita + 
            ifelse(is.na(plot_beers$population_2016)==TRUE, 0, 0.8*plot_beers$population_2016)/total_pop + 
            ifelse(is.na(plot_beers$count_breweries)==TRUE, 0, 0.8*plot_beers$count_breweries)/total_breweries + 
            ifelse(is.na(plot_beers$count_brews)==TRUE, 0, 0.8*plot_beers$count_brews)/total_brews +
            ifelse(is.na(plot_beers$MedianABV)==TRUE, 0, 2*plot_beers$MedianABV)/total_medians +
            ifelse(is.na(plot_beers$MedianIBU)==TRUE, 0, 0.8*plot_beers$MedianIBU) /total_ibu)

#lets see top score areas
head(plot_beers[order(-plot_beers$market_score),], 5)
## # A tibble: 5 x 16
##   abbr  MedianABV MedianIBU fips  full  pop_2015 ABV_Percent   lat   long
##   <chr>     <dbl>     <dbl> <chr> <chr>    <dbl>       <dbl> <dbl>  <dbl>
## 1 CA       0.058         42 06    Cali… 39144818        5.8   37.3 -119. 
## 2 CO       0.0605        40 08    Colo…  5456574        6.05  39.0 -106. 
## 3 TX       0.055         33 48    Texas 27469114        5.5   31.2 -100. 
## 4 MI       0.062         35 26    Mich…  9922576        6.2   44.6  -86.4
## 5 PA       0.057         30 42    Penn… 12802503        5.7   41.0  -77.6
## # … with 7 more variables: offset <dbl>, count_breweries <int>,
## #   count_brews <int>, gallons_beer <dbl>, gallons_beer_per_capita <dbl>,
## #   population_2016 <dbl>, market_score <dbl>
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,16)], by=c("region" = "full")))

#create a base map
us_base <- ggplot(data = plot_beers_states) + 
  geom_polygon(aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) #+

p <- us_base +
  geom_polygon(aes(x = long, y = lat, group = group, fill = market_score), color = "white") +
  scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
  geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
  theme_bw() +
  ditch_the_axes +
  geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), color="black") +
  geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), 
                  box.padding = unit(0.4, "lines"),
                  point.padding = unit(0.2, "lines"),
                  direction="x", 
                  nudge_x = 0.7) +
  labs(title = paste0("Mustang Score","\n","(All Scores Sum to 100)", "\n", "2x ABV Weight")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = paste0("Mustang Score","\n","(Higher Is Better)")))

p



############################
## GENERATE ALL CODEBOOKS
############################]#####################################
## GENERATE CODEBOOKS FOR ALL RAW DATASETS
## PLEASE NOTE CODEBOOKS ONLY INCLUDE ORIGINAL DATA
## TRANSITORY AND DERIVATIVE DATASETS ARE NOT INCLUDED
#####################################

setwd(paste0(root_dir,"/Codebook"))

  suppressWarnings(suppressMessages(makeCodebook(beers, codebook = TRUE, replace=TRUE)))
  suppressWarnings(suppressMessages(makeCodebook(breweries, codebook = TRUE, replace=TRUE)))
  suppressWarnings(suppressMessages(makeCodebook(population, codebook = TRUE, replace=TRUE)))
  suppressWarnings(suppressMessages(makeCodebook(consumption, codebook = TRUE, replace=TRUE)))

setwd(root_dir)



Conclusion - Hopsy Data Analysis

For this study R Mustangs was tasked with analyzing both Beers and Breweries data sets for the United States. The premise of this study is that Hopsy, a premier source-to-consumer beer distributor, is expanding their Market Penetration Strategy and would like to targeted areas and for signing up new Craft Brewers. Additionally, they plan on exploring cross-state distributions opportunities. This would entail selling from their existing stable of supplier beers, to new states with high market opportunity for their Craft beer consumption.

R Mustangs was ask to:

While the data had some gaps (1005 missing values for IBU, 62 missing values for ABV, and 5 missing values for Styles), there was plenty of data to work with. It was found from this study that regional preferences on beer as well as State-level factors impact how many Breweries are in each state. It was also found that ABV and IBU are linearly corrolated and that IBU can serve as a relative proxy to determine overall categorical ABV percentiles.

In order to hav a fulsome conversation on Market Dynamics, the R Mustang team pulled in both Census population data and estimated per capita consumption data. This allowed us to visualize State-by-State differences. And, more importantly, it allowed us to introduce a framework for evaluating States for further discussion, prioritization, and future market penetration. We introduce the concept of the Mustang Scoring Framework, and through this initial analysis recommend that Hopsy review California, Colorado, Texas, Michigan, and Pennsylvania for the next States Hopsy should target. If those States already have penetration plans, the Mustang Scoring Framework provides a crystallized ranking system for review in order to determine the next State to target.

For future work, we would recommend analysis of pricing and incorporating that into the overall data framework. We would also recommend updating all datasets with the most current data lineages.

We thank Hopsy for the opportunity to serve them and hope that we may be of service again in the very near future.